{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "\n",
    "# An empty device list hides every CUDA GPU from this process; it is set here,\n",
    "# before the next cell imports tensorflow.\n",
    "os.environ['CUDA_VISIBLE_DEVICES'] = ''"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import tensorflow as tf\n",
    "import tensorflow_datasets as tfds\n",
    "from t5.data import preprocessors as prep\n",
    "import functools\n",
    "import t5\n",
    "import gin\n",
    "import sentencepiece as spm\n",
    "from glob import glob\n",
    "import os\n",
    "\n",
    "# gin config read from the working directory (the file name suggests the\n",
    "# pretrained T5 base operative config -- TODO confirm provenance).\n",
    "gin.parse_config_file('pretrained_models_base_operative_config.gin')\n",
    "# SentencePiece model path; reused further down as the task's\n",
    "# sentencepiece_model_path.\n",
    "vocab = 'sp10m.cased.ms-en.model'\n",
    "sp = spm.SentencePieceProcessor()\n",
    "# Last expression of the cell: the value returned by Load is echoed as output.\n",
    "sp.Load(vocab)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Directory holding the raw text dumps. Set the PURE_TEXT_DIR environment\n",
    "# variable to point somewhere else; the default keeps the original location.\n",
    "DATA_DIR = os.environ.get('PURE_TEXT_DIR', '/home/husein/pure-text')\n",
    "\n",
    "# Raw corpora; each one is converted to a TSV file further down.\n",
    "files = [\n",
    "    os.path.join(DATA_DIR, name)\n",
    "    for name in (\n",
    "        'dumping-iium.txt',\n",
    "        'dumping-twitter.txt',\n",
    "        'dumping-instagram.txt',\n",
    "    )\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'dumping-iium.txt'"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "os.path.basename(files[0])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "import re\n",
    "\n",
    "def cleaning(string):\n",
    "    \"\"\"Flatten `string` onto one line.\n",
    "\n",
    "    Newlines, tabs and runs of spaces each become a single space, and\n",
    "    whitespace at both ends is stripped.\n",
    "    \"\"\"\n",
    "    # Any run mixing spaces, newlines and tabs shrinks to exactly one space.\n",
    "    collapsed = re.sub(r'[ \\n\\t]+', ' ', string)\n",
    "    return collapsed.strip()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "/home/husein/pure-text/dumping-iium.txt 1121978\n",
      "/home/husein/pure-text/dumping-twitter.txt 10692819\n",
      "/home/husein/pure-text/dumping-instagram.txt 3226766\n"
     ]
    }
   ],
   "source": [
    "# Cleaned lines with this many characters or fewer are dropped.\n",
    "MIN_CHARS = 20\n",
    "\n",
    "for file in files:\n",
    "    # Output lands in the working directory as <basename>.tsv.\n",
    "    filename = f'{os.path.split(file)[1]}.tsv'\n",
    "    count = 0\n",
    "    # Stream the dump line by line instead of reading it whole: the largest\n",
    "    # file here has over ten million lines. The encoding is explicit so the\n",
    "    # result does not depend on the platform default.\n",
    "    with open(file, encoding = 'utf-8') as fopen:\n",
    "        with tf.io.gfile.GFile(filename, 'w') as outfile:\n",
    "            for line in fopen:\n",
    "                line = line.rstrip('\\n')\n",
    "                if not line:\n",
    "                    continue\n",
    "                count += 1\n",
    "                c = cleaning(line)\n",
    "                if len(c) > MIN_CHARS:\n",
    "                    # Two tab-separated columns: an empty first field, then the text.\n",
    "                    outfile.write('%s\\t%s\\n' % ('', c))\n",
    "    # Number of non-empty input lines, counted before the length filter.\n",
    "    print(file, count)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "def dumping_dataset(split, shuffle_files = False):\n",
    "    \"\"\"Build the raw text dataset handed to the T5 task as its `dataset_fn`.\n",
    "\n",
    "    Reads the two TSV files listed below line by line, splits every line on\n",
    "    the tab into two string columns and returns a dataset of\n",
    "    {'title': ..., 'text': ...} dicts. Both `split` and `shuffle_files` are\n",
    "    accepted but have no effect on the result.\n",
    "\n",
    "    NOTE(review): 'dumping-instagram.txt.tsv' is written earlier in this\n",
    "    notebook but is not read here -- confirm whether that is intentional.\n",
    "    \"\"\"\n",
    "    del shuffle_files\n",
    "\n",
    "    tsv_paths = ['dumping-iium.txt.tsv', 'dumping-twitter.txt.tsv']\n",
    "\n",
    "    # Quote handling is off, so quote characters inside the text stay literal.\n",
    "    parse_row = functools.partial(\n",
    "        tf.io.decode_csv,\n",
    "        record_defaults = ['', ''],\n",
    "        field_delim = '\\t',\n",
    "        use_quote_delim = False,\n",
    "    )\n",
    "\n",
    "    def to_features(*columns):\n",
    "        # Pair the parsed columns with their feature names, in order.\n",
    "        return dict(zip(['title', 'text'], columns))\n",
    "\n",
    "    lines = tf.data.TextLineDataset(tsv_paths)\n",
    "    rows = lines.map(\n",
    "        parse_row,\n",
    "        num_parallel_calls = tf.data.experimental.AUTOTUNE,\n",
    "    )\n",
    "    return rows.map(to_features)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Drop any earlier registration under this name before adding it again\n",
    "# (presumably so the cell can be re-run in a live kernel).\n",
    "t5.data.TaskRegistry.remove('dumping_dataset')\n",
    "\n",
    "# The 'text' column becomes 'targets'; 'inputs' is mapped from no source key.\n",
    "rekey_text = functools.partial(\n",
    "    prep.rekey,\n",
    "    key_map = {'inputs': None, 'targets': 'text'},\n",
    ")\n",
    "\n",
    "t5.data.TaskRegistry.add(\n",
    "    'dumping_dataset',\n",
    "    dataset_fn = dumping_dataset,\n",
    "    splits = ['train'],\n",
    "    text_preprocessor = rekey_text,\n",
    "    token_preprocessor = prep.unsupervised,\n",
    "    sentencepiece_model_path = vocab,\n",
    "    metric_fns = [],\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:tokens_length=1137 inputs_length=1024 targets_length=229 noise_density=0.15 mean_noise_span_length=3.0 \n"
     ]
    }
   ],
   "source": [
    "nq_task = t5.data.TaskRegistry.get(\"dumping_dataset\")\n",
    "# 'train' is the only split registered for this task; dumping_dataset ignores\n",
    "# the value and always reads the same TSV files.\n",
    "ds = nq_task.get_dataset(split='train', sequence_length={\"inputs\": 1024, \"targets\": 1024})\n",
    "r = tfds.as_numpy(ds)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'inputs': array([  203, 27499,     3,  9827,   266,   420, 32099,   108,   486,\n",
       "        13597,     3, 32098,   103,   322,   280,     3,  1236,   354,\n",
       "         2202,  2683,    25,  6572, 10897,    14,  1887,   103,    13,\n",
       "        21206,   205,  9132,    13, 11965,     3,   335,   155,    17,\n",
       "          200,     3,  1616,    17,   736,    14, 20086, 12231,    13,\n",
       "        11965,     3,  2569,    13, 21206,   205,  9132,   200,  6765,\n",
       "            3, 16602,   317,   802,  1428,     3, 20300,   802,  1330,\n",
       "         1229,     3, 20300,    24, 32097,  1830,   271,  1817,   140,\n",
       "         1644,     3,  5746,  1518, 17294,    14,    13,   117, 32096,\n",
       "           14,    13, 17235,  4878, 32095, 10294,    24,    50, 32094,\n",
       "           13,  2906, 10891,  7277,    14,   802,  3249,     3, 16602,\n",
       "          317,    97,    24,  1231,   279, 32093,     3, 16602,   317,\n",
       "          532,   987,  3871,  4755,     3, 20300, 10294, 12120,   117,\n",
       "         2695, 10773,    41,   176,     3, 32092,   100,   155, 32091,\n",
       "        13719, 15647,   215, 32090,  1062,  3420, 10294,   987,  3871,\n",
       "         4755,  6311,  5132,     3, 32089,    13,   940,     3,  9100,\n",
       "         6357,   103, 10294, 29293,    24,   234,   116,     3,    13,\n",
       "        25364,    14,  2427,  2303,  2105,    14, 10294,    90, 32088,\n",
       "        11871,    41,  1042,     3,  5713,  2013, 10294, 26142,   104,\n",
       "        10215, 18980,     3,  1972,  1131, 10294,   155,    13,  1278,\n",
       "           14, 10294,  7958,  8236, 14277, 10294,    25,    13, 21825,\n",
       "         2496, 10294,     3,  8332,  1118,    14,  6900,  2013, 10294,\n",
       "         2929,    14,  6376,   382,   765,    61,   704,    24,    13,\n",
       "        21825,  7907,    38,     3,   686,   496,   185, 10294,    13,\n",
       "        14002,  2496, 10294,    14, 14982,    13,  1278,  2013, 32087,\n",
       "           13, 14002,  7737,     3,   448,  3343,     3,     3,     3,\n",
       "        16602,  9755,    14, 10294,    13,   940,  2722, 27259,    13,\n",
       "        28330,     3, 12342,    50,  1284,   143, 32086,     3,  8973,\n",
       "        11273, 32085,   155,   802,    24,  1428,    14,  5816,   108,\n",
       "           17, 10294,  2366,     3,  8332,   804,   464,  7958,  6421,\n",
       "         6997,    14,   464,    13,  6183, 12665,    13,    79,   353,\n",
       "          152,    73, 32084, 15998, 32083,  3736,  2921,     3, 32082,\n",
       "           17,   627,   802,    14,    17, 15799, 10087,    14,    17,\n",
       "         7958,  1887, 12041,    14,  3292,  4704,     3, 22326, 32081,\n",
       "         2030,    13,  1278,     3,   418,  9079,  1191,  5008, 21885,\n",
       "           14,  9036,    73,   116,   528, 26260,    24,   234,   802,\n",
       "            3,  6185,   342, 27710,   329,  2427,   177,    24,   188,\n",
       "        32080,    13,  1278,    17,  7958,  6003, 32079, 20241, 15760,\n",
       "           73,    17,   292,     3, 32078,    25,  8690,   591,    61,\n",
       "           77, 12176,    98,    14,  2202,    29,   189,    13, 21206,\n",
       "          205,  9132,     3,  1865,  2257,    14, 10492,    14,    13,\n",
       "        15359,  1956,    14, 12203,    14,   177,   215,    13,   765,\n",
       "           17,  7958,    24,  1661,     3,    13, 21227,    29, 32077,\n",
       "         5008,     3,  3644,   210,   289,   932,    78,    54, 32076,\n",
       "           14, 10800,   103,     3,  5069,  1553,   750,  5784,    13,\n",
       "         1278,     3,   249,  6003,  1596,   765,  5784,    13,  1278,\n",
       "           55,  1847,  3023,     3,  5069,   802,    13,  2135,   496,\n",
       "          215, 19084,    14,  5816,  3899, 14982,   215,     3,   680,\n",
       "          371,  5008,   776,     3,  2179,    15,  1297,  1071,   445,\n",
       "         3917,    18,    15, 29126,     3,  7351,  1963,    14,    13,\n",
       "        22970,  7935, 22633, 32075,     1]),\n",
       " 'targets': array([32099,    13,   940, 32098, 24472, 32097, 11200, 32096,  1987,\n",
       "         6000, 32095,     3,    13, 21421,  1593,   188, 32094, 11631,\n",
       "           14,    19,  2482, 32093,   292, 32092, 28225, 32091,    14,\n",
       "        10294, 32090,    14,    13,   728, 32089,  1236,   354,   693,\n",
       "          103,    13,   765, 10294,    42,   234, 32088,  2191,    55,\n",
       "           13, 32087, 10294,    24, 32086,    24,  1210,  5049,    55,\n",
       "          218, 32085,   528,  2929, 32084,    17,    13, 32083,  2474,\n",
       "           20,  3903, 32082,    13, 22070, 10294,    25,  8513,   215,\n",
       "        32081,  2683, 32080,     3, 25702, 32079,  1596,   115,     3,\n",
       "           13, 32078,  1157,    17,    61,  1994, 32077,   210,    51,\n",
       "         9367, 32076,     3, 19268,  4082,    14, 10372, 32075, 10294,\n",
       "         1110,     3,     1])}"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Peek at the first preprocessed example. iter() makes this work whether\n",
    "# tfds.as_numpy returned a generator or a plain (non-iterator) iterable.\n",
    "next(iter(r))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
